In [1]:
from IPython.display import Image
In [2]:
Image('images/dataset_print.png', width=1000, height=300)
Out[2]:
Columns Description¶
- country: The country that the wine is from
- description: The WineEnthusiast description about the wine
- designation: The vineyard within the winery where the grapes that made the wine are from
- points: The number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for
- price: The cost for a bottle of the wine
- province: The province or state that the wine is from
- region_1: The wine growing area in a province or state (ie Napa)
- region_2: Sometimes there are more specific regions specified within a wine growing area (ie Rutherford inside the Napa Valley), but this
- taster_name: WineEnthusiast name
- taster_twitter_handle: WineEnthusiast Twitter account
- title: The title of the wine review, which often contains the vintage if you're interested in extracting that feature
- variety: The type of grapes used to make the wine (ie Pinot Noir)
- winery: The winery that made the wine!
Let's Code¶
In [3]:
import numpy as np
import pandas as pd
from unidecode import unidecode
import re
from scipy import stats
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
import missingno
import random
random.seed(911)
# viz
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
# some costumizations
np.random.seed(911)
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 2.0
pio.templates.default = "plotly_dark"
import warnings
warnings.filterwarnings('ignore')
In [4]:
df = pd.read_csv('data/winemag-data-130k-v2.csv', encoding='utf-8')
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 129971 entries, 0 to 129970 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 129971 non-null int64 1 country 129908 non-null object 2 description 129971 non-null object 3 designation 92506 non-null object 4 points 129971 non-null int64 5 price 120975 non-null float64 6 province 129908 non-null object 7 region_1 108724 non-null object 8 region_2 50511 non-null object 9 taster_name 103727 non-null object 10 taster_twitter_handle 98758 non-null object 11 title 129971 non-null object 12 variety 129970 non-null object 13 winery 129971 non-null object dtypes: float64(1), int64(2), object(11) memory usage: 13.9+ MB
In [6]:
# removing the first column
df = df.drop(columns='Unnamed: 0')
In [7]:
df.head()
Out[7]:
| country | description | designation | points | price | province | region_1 | region_2 | taster_name | taster_twitter_handle | title | variety | winery | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Italy | Aromas include tropical fruit, broom, brimston... | Vulkà Bianco | 87 | NaN | Sicily & Sardinia | Etna | NaN | Kerin O’Keefe | @kerinokeefe | Nicosia 2013 Vulkà Bianco (Etna) | White Blend | Nicosia |
| 1 | Portugal | This is ripe and fruity, a wine that is smooth... | Avidagos | 87 | 15.0 | Douro | NaN | NaN | Roger Voss | @vossroger | Quinta dos Avidagos 2011 Avidagos Red (Douro) | Portuguese Red | Quinta dos Avidagos |
| 2 | US | Tart and snappy, the flavors of lime flesh and... | NaN | 87 | 14.0 | Oregon | Willamette Valley | Willamette Valley | Paul Gregutt | @paulgwine | Rainstorm 2013 Pinot Gris (Willamette Valley) | Pinot Gris | Rainstorm |
| 3 | US | Pineapple rind, lemon pith and orange blossom ... | Reserve Late Harvest | 87 | 13.0 | Michigan | Lake Michigan Shore | NaN | Alexander Peartree | NaN | St. Julian 2013 Reserve Late Harvest Riesling ... | Riesling | St. Julian |
| 4 | US | Much like the regular bottling from 2012, this... | Vintner's Reserve Wild Child Block | 87 | 65.0 | Oregon | Willamette Valley | Willamette Valley | Paul Gregutt | @paulgwine | Sweet Cheeks 2012 Vintner's Reserve Wild Child... | Pinot Noir | Sweet Cheeks |
In [8]:
print('# Tasters \t', df['taster_name'].nunique())
print('..'*12)
print('# Wines \t', df['title'].nunique())
print('# Winerys \t', df['winery'].nunique())
print('# Countries \t', df['country'].nunique())
# Tasters 19 ........................ # Wines 118840 # Winerys 16757 # Countries 43
So it's just a learning project, I will use country instead of wine.
EDA¶
In [9]:
missingno.matrix(df, figsize=(15, 5), fontsize=10)
Out[9]:
<Axes: >
In [10]:
# Generate a choropleth map
country_wines_volume = df.groupby('country')[['title']].count().reset_index()
fig = px.choropleth(country_wines_volume,
locations="country",
locationmode = "country names",
color="title",
hover_name="country",
color_continuous_scale=px.colors.sequential.Peach)
fig.update_layout(width=1200, height=600, title={'text': '# Wines per country'})
fig.show()
In [11]:
fig = px.histogram(df[['price']], color_discrete_sequence=px.colors.sequential.Peach)
fig.update_layout(title={'text': 'Distribution of Wines Price'})
In [12]:
fig = px.histogram(df[['points']], color_discrete_sequence=px.colors.sequential.Peach)
fig.update_layout(title={'text': 'Distribution of Wines Score'})
In [13]:
print(f'Points mean: {np.mean(df.points):.2f} \tPoints trimmed mean: {stats.trim_mean(df.points, 0.1):.2f}')
print(f'Points mean: {np.mean(df.price):.2f} \tPoints trimmed mean: {stats.trim_mean(df.price, 0.1):.2f}')
Points mean: 88.45 Points trimmed mean: 88.43 Points mean: 35.36 Points trimmed mean: 33.23
In [14]:
top10_representatives_country = df['country'].value_counts()[:10].index
df_top10_representatives_country = df[df['country'].isin(top10_representatives_country)]
fig = px.ecdf(df_top10_representatives_country, x=['price'], color='country', color_discrete_sequence=px.colors.sequential.YlGn)
fig.update_layout(title={'text': 'Empirical Cumulative Distribution of feature Price'})
fig.show()
In [15]:
top10_representatives_country = df['country'].value_counts()[:10].index
df_top10_representatives_country = df[df['country'].isin(top10_representatives_country)]
fig = px.ecdf(df_top10_representatives_country, x=['points'], color='country', color_discrete_sequence=px.colors.sequential.YlGn)
fig.update_layout(title={'text': 'Empirical Cumulative Distribution of feature Points'})
In [16]:
fig = make_subplots(rows=2, cols=2,
column_widths=[0.8, 0.2],
row_heights=[0.2, 0.8],
shared_xaxes=True,
shared_yaxes=True,
vertical_spacing=0.05,
horizontal_spacing=0.05)
# Create a hexbin plot
fig.add_trace(go.Histogram2dContour(x=df["price"],
y=df["points"],
colorscale="Peach",
ncontours=7,
showscale=False,
nbinsx=30,
nbinsy=30,
hovertemplate="Price: $%{x}<br>Points: %{y}<br>Density: %{z}<extra></extra>"),
row=2, col=1)
# Create the histograms
fig.add_trace(go.Histogram(x=df["price"],
nbinsx=30,
showlegend=False,
hovertemplate="Price: $%{x}"),
row=1, col=1)
fig.add_trace(go.Histogram(y=df["points"],
nbinsy=30,
showlegend=False,
hovertemplate="Points: %{y}",
),
row=2, col=2)
# Update the layout and axes properties
fig.update_xaxes(title_text="Price", row=2, col=1)
fig.update_yaxes(title_text="Points", row=2, col=1)
fig.update_xaxes(showticklabels=False, row=1, col=1)
fig.update_yaxes(showticklabels=False, row=2, col=2)
fig.update_layout(title={"text": "Marginal Plot (Price x Points)"})
fig.show()
Users and Items <> Tasters and Wines¶
In [17]:
df = df.dropna(subset=['taster_name'])
df = df.dropna(subset=['country'])
In [18]:
df.groupby(['taster_name'])[['country']].count()
Out[18]:
| country | |
|---|---|
| taster_name | |
| Alexander Peartree | 415 |
| Anna Lee C. Iijima | 4415 |
| Anne Krebiehl MW | 3676 |
| Carrie Dykes | 139 |
| Christina Pickard | 6 |
| Fiona Adams | 27 |
| Jeff Jenssen | 469 |
| Jim Gordon | 4177 |
| Joe Czerwinski | 5145 |
| Kerin O’Keefe | 10776 |
| Lauren Buzzeo | 1832 |
| Matt Kettmann | 6332 |
| Michael Schachner | 15127 |
| Mike DeSimone | 502 |
| Paul Gregutt | 9531 |
| Roger Voss | 25512 |
| Sean P. Sullivan | 4966 |
| Susan Kostrzewa | 1080 |
| Virginie Boone | 9537 |
In [19]:
df.groupby(['country'])[['taster_name']].count()
Out[19]:
| taster_name | |
|---|---|
| country | |
| Argentina | 3797 |
| Armenia | 2 |
| Australia | 2037 |
| Austria | 3337 |
| Bosnia and Herzegovina | 2 |
| Brazil | 52 |
| Bulgaria | 141 |
| Canada | 256 |
| Chile | 4361 |
| China | 1 |
| Croatia | 73 |
| Cyprus | 11 |
| Czech Republic | 12 |
| Egypt | 1 |
| England | 74 |
| France | 21828 |
| Georgia | 86 |
| Germany | 2134 |
| Greece | 466 |
| Hungary | 145 |
| India | 9 |
| Israel | 500 |
| Italy | 11042 |
| Lebanon | 35 |
| Luxembourg | 6 |
| Macedonia | 12 |
| Mexico | 65 |
| Moldova | 59 |
| Morocco | 28 |
| New Zealand | 1311 |
| Peru | 16 |
| Portugal | 5686 |
| Romania | 120 |
| Serbia | 12 |
| Slovakia | 1 |
| Slovenia | 87 |
| South Africa | 1328 |
| Spain | 6581 |
| Switzerland | 7 |
| Turkey | 90 |
| US | 37730 |
| Ukraine | 14 |
| Uruguay | 109 |
In [20]:
df['country'] = df['country'].apply(lambda text: re.sub(r'[.,!?;:\'"()\[\]{}&@#$%^_+=×÷<>≤≥-]', '', text))
df['country'] = df['country'].apply(lambda text: unidecode(text).lower())
df['taster_name'] = df['taster_name'].apply(lambda text: re.sub(r'\xa0', '', text))
df['taster_name'] = df['taster_name'].apply(lambda text: unidecode(text).lower())
In [21]:
print(f'Number of duplicated data: {df.duplicated().sum()}')
Number of duplicated data: 8652
In [22]:
df = df.drop_duplicates()
Creating a Recommendation System | Collaborative Filtering ¶
In [23]:
# Creating the rank matrix
df_rank = df.pivot_table(index='taster_name', columns='country', values='points', aggfunc='mean')
# Convert the DataFrame to a sparse matrix
sparse_matrix = df_rank.values.T
In [24]:
print(f'#Countrys: {sparse_matrix.shape[0]}\t #Tasters:{sparse_matrix.shape[1]}')
#Countrys: 43 #Tasters:19
train - test split¶
In [25]:
def train_test_split_matrix(matrix, test_rate):
"""
Sampling data in training and test sets.
==========
Parameters
matrix: numpy array
The rating matrix
test_rate: float
The percentage of test set
======
Yields
train_matrix: numpy array
The rating training matrix
test_rate: numpy array
The rating test matrix
"""
# Extract non-NaN indices
non_nan_indices = np.argwhere(~np.isnan(matrix))
# Split these indices into train and test
train_indices, test_indices = train_test_split(non_nan_indices, test_size=test_rate, random_state=911)
# Construct train and test matrices
train_matrix = np.full(matrix.shape, np.nan)
test_matrix = np.full(matrix.shape, np.nan)
for i, j in train_indices:
train_matrix[i, j] = matrix[i, j]
for i, j in test_indices:
test_matrix[i, j] = matrix[i, j]
return train_matrix, test_matrix
In [26]:
r_train_holdout, r_test = train_test_split_matrix(sparse_matrix, test_rate=0.15)
r_train, r_valid = train_test_split_matrix(r_train_holdout, test_rate=0.15)
In [27]:
r_train.shape, r_valid.shape, r_test.shape
Out[27]:
((43, 19), (43, 19), (43, 19))
In [28]:
nulls = 0
for i in range(r_train.shape[0]):
if np.isnan(r_train[i]).sum() == r_train.shape[1]:
nulls+=1
print(f'#Countrys without rank: {nulls}')
#Countrys without rank: 9
In [29]:
nulls = 0
for i in range(r_train.shape[1]):
if np.isnan(r_train[:, i]).sum() == r_train.shape[0]:
nulls+=1
print(f'#Tasters without rank: {nulls}')
#Tasters without rank: 1
In [30]:
print('matrix density:')
np.invert(np.isnan(r_train)).sum() / (np.isnan(r_train).sum() + np.invert(np.isnan(r_train)).sum())
matrix density:
Out[30]:
0.09791921664626684
Let's focus on only one taster¶
In [31]:
tx = 12
In [32]:
df_rank.index[tx]
Out[32]:
'michael schachner'
In [33]:
r_train[:, tx]
Out[33]:
array([ nan, nan, nan, nan, nan,
84.63265306, nan, nan, 86.48703384, nan,
nan, nan, nan, nan, nan,
82.85 , nan, nan, nan, nan,
nan, nan, 89.17105263, nan, nan,
nan, nan, nan, nan, nan,
83.5625 , nan, nan, nan, nan,
nan, nan, nan, nan, nan,
nan, 86.66326531, 86.26760563])
In [34]:
tx_avg = np.nanmean(r_train[:, tx])
print(f'{df_rank.index[tx]} average rating: {tx_avg:.2f}')
michael schachner average rating: 85.66
In [35]:
global_avg = np.nanmean(r_train[:, :])
print(f'Global Rating average rating: {global_avg:.2f}')
Global Rating average rating: 87.18
... And in only one country¶
In [36]:
ci = 22
In [37]:
df_rank.columns[ci]
Out[37]:
'italy'
In [38]:
r_train[ci, :]
Out[38]:
array([ nan, nan, nan, nan, nan,
nan, nan, nan, 86.03409091, 88.89780344,
nan, nan, 89.17105263, nan, nan,
88.67708333, nan, nan, nan])
In [39]:
ci_avg = np.nanmean(r_train[ci, :])
print(f'{df_rank.columns[ci]} average rating: {ci_avg:.2f}')
italy average rating: 88.20
The True Rating¶
In [40]:
r_train[ci, tx]
Out[40]:
89.17105263157895
Global Baseline Estimate¶
In [41]:
tx_diff_avg = tx_avg - global_avg
ci_diff_avg = ci_avg - global_avg
print(f"The overall avarage difference between {df_rank.index[tx]}'s rates is {tx_diff_avg:.2f} points")
print(f"The overall avarage difference between {df_rank.columns[ci]}'s rates is {ci_diff_avg:.2f} points")
The overall avarage difference between michael schachner's rates is -1.52 points The overall avarage difference between italy's rates is 1.02 points
In [42]:
baseline_estimator = lambda t, w, g: g + t + w
baseline_estimate_txci = baseline_estimator(tx_diff_avg, ci_diff_avg, global_avg)
In [43]:
baseline_estimate_txci
Out[43]:
86.67835639015908
Others Countrys Global Baseline¶
In [44]:
def baseline(matrix, w, t):
"""
Calculate the global baseline estimate for a given (t, w)
==========
Parameters
matrix: numpy array
The rating matrix
t: int
The taster index
w: int
The country index
======
Yields
baseline_estimate: float
The estimate rating to the r(t, w)
"""
tasters_avg = np.nanmean(matrix[:, t])
country_avg = np.nanmean(matrix[w, :])
global_avg = np.nanmean(matrix[:, :])
tasters_diff_avg = tasters_avg - global_avg
country_diff_avg = country_avg - global_avg
baseline_estimate = baseline_estimator(tasters_avg, tasters_diff_avg, country_diff_avg)
return baseline_estimate
Collaborative Filtering¶
In [45]:
tasters_means = np.nanmean(r_train, axis=0)
tasters_means = np.nan_to_num(x=tasters_means, nan=global_avg) # if have new tasters, fill with global average
tasters_means
Out[45]:
array([85.78067885, 86.1990862 , 90.58788886, 86.35658915, 93. ,
86.75 , 87.67280805, 88.60355815, 86.86953692, 88.89780344,
85.38157895, 89.53080817, 85.66201578, 88.97994306, 88.13454047,
88.74629187, 89.52757765, 85.58347499, 87.17866697])
In [46]:
r_train_centered = r_train - tasters_means[None, :]
r_train_centered = np.nan_to_num(r_train_centered)
In [47]:
cosine_similarity = lambda A, B: np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B))
In [48]:
Countrys_cosine_similarity = [cosine_similarity(r_train_centered[ci], r_train_centered[m])
if r_train_centered[m].sum() != 0 else 0
for m in range(r_train_centered.shape[0])]
In [49]:
Countrys_cosine_similarity[:10]
Out[49]:
[0, 0.0, -0.23156768917512674, -0.018920815066425397, 0, -0.9726296382938932, 0.0, -0.04655317393310503, 0.5911017981162034, 0.0]
In [50]:
# define k neighbors
k = 5
np_Countrys_cosine_similarity = np.array(Countrys_cosine_similarity).reshape(-1, )
In [51]:
top_k = pd.DataFrame({'Countrys_sim': np_Countrys_cosine_similarity}).sort_values(by='Countrys_sim', ascending=False).iloc[1: k+1]
kj_index = top_k.index.values
print(f'Top {k} greater similar Countrys: {kj_index}')
Top 5 greater similar Countrys: [41 8 42 37 18]
In [52]:
print(f'Top {k} greater similarity: {top_k.Countrys_sim.values}')
Top 5 greater similarity: [0.97262964 0.5911018 0.33306081 0.11233192 0.00786082]
In [53]:
sum_N_rating = list()
sum_N_similarity = list()
for j in kj_index:
sij = np_Countrys_cosine_similarity[j]
rxj = r_train[j, tx]
bxj = baseline(r_train, j, tx)
sum_Nrj = sij*(rxj - bxj)
sum_N_rating.append(sum_Nrj)
sum_N_similarity.append(sij)
cf = np.nansum(sum_N_rating) / np.nansum(sum_N_similarity)
In [54]:
rxi = baseline_estimate_txci + cf
In [55]:
rxi
Out[55]:
89.26913061143844
Evaluating¶
In [56]:
def root_mean_squared_error(r_true, r_pred):
"""
Sampling data in training and test sets.
==========
Parameters
r_true: numpy array
The true rating matrix
r_true: numpy array
The prediction rating matrix
======
Yields
rmse: float
The root mean square error
"""
# Get indices where the value is not np.nan
idx = np.where(~np.isnan(r_true))
paired_idx = list(zip(idx[0], idx[1]))
N = np.invert(np.isnan(r_true)).sum()
ssr = 0
for r, c in paired_idx:
# if we have ratings to compare
if r_pred[r, c] != np.nan:
ssr += (r_pred[r, c] - r_true[r, c])**2
# if we don't have the true rating
else:
pass
rmse = (ssr/N)**0.5
return rmse
In [57]:
# Let create arrays to compare our sample rating
rxi = np.array([rxi]).reshape(1, 1)
rxr = np.array([r_train[ci, tx]]).reshape(1, 1)
In [58]:
print(f"The get RMSE by {df_rank.index[tx]}'s real rating about {df_rank.columns[ci]} and the predicted value:")
root_mean_squared_error(rxr, rxi)
The get RMSE by michael schachner's real rating about italy and the predicted value:
Out[58]:
0.09807797985949662
Let's make a full pipeline to predict ratings¶
In [59]:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_is_fitted
In [60]:
class RatingEstimator(BaseEstimator, RegressorMixin):
"""
A wine nationalitys ratings estimator
==========
Parameters
k_neighbors: int
The number of neighbors to be involved in the mathematical rating estimator
"""
def __init__(self, k_neighbors=3):
self.k_neighbors = k_neighbors
def fit(self, ratings):
"""
Fit the estimator to the training data.
Just save the matrix.
==========
Parameters
ratings: np.array
The ratings we have
"""
self.r_matrix = ratings
# Custom attribute to track if the estimator is fitted
self._is_fitted = True
return self
def _baseline_estimate(self):
"""
The baseline estimator part
======
Yields
self.global_avg: float
The global baseline estimate
"""
try:
self.global_avg = np.nanmean(self.r_matrix[:, :])
self.tx_avg = np.nanmean(self.r_matrix[:, self.taster_idx])
self.tx_diff_avg = self.tx_avg - self.global_avg
self.ci_avg = np.nanmean(self.r_matrix[self.country_idx, :])
self.ci_diff_avg = self.ci_avg - self.global_avg
return baseline_estimator(self.tx_diff_avg, self.ci_diff_avg, self.global_avg)
except:
return self.global_avg
@staticmethod
def _calculate_cosine_similarity(ratings, ci):
"""
Calculate the cosine similarity between two vectors
==========
Parameters
ratings: np.array
The ratings we have
ci: int
The country thay will by considered
======
Yields
Countrys_cosine_similarity: float
The similarity
"""
tasters_means = np.nanmean(ratings, axis=0)
tasters_means = np.nan_to_num(x=tasters_means, nan=global_avg) # if have new tasters, fill with global average
ratings_centered = ratings - tasters_means[None, :]
ratings_centered = np.nan_to_num(ratings_centered, nan=0.0)
Countrys_cosine_similarity = [cosine_similarity(ratings_centered[ci], ratings_centered[m])
if ratings_centered[m].sum() != 0 else 0
for m in range(ratings_centered.shape[0])]
return Countrys_cosine_similarity
def _collaborative_filtering(self, k):
"""
A collaborative filtering estimation part
==========
Parameters
k: int
The number of neighbors to be involved in the mathematical rating estimator
======
Yields
cf: float
The collaborative filtering estimation
"""
try:
self.countrys_cosine_similarity = RatingEstimator._calculate_cosine_similarity(self.r_matrix, self.country_idx)
np_countrys_cosine_similarity = np.array(self.countrys_cosine_similarity).reshape(-1, )
top_k = pd.DataFrame({'countrys_sim': np_countrys_cosine_similarity}).sort_values(by='countrys_sim', ascending=False).iloc[1: k+1]
self.kj_index = top_k.index.values
self.k_sim = top_k.countrys_sim.values
sum_N_rating = list()
sum_N_similarity = list()
for j in self.kj_index:
sij = np_countrys_cosine_similarity[j]
rxj = self.r_matrix[j, self.taster_idx]
bxj = baseline(self.r_matrix, j, self.taster_idx)
sum_Nrj = sij*(rxj - bxj)
sum_N_rating.append(sum_Nrj)
sum_N_similarity.append(sij)
cf = np.nansum(sum_N_rating) / np.nansum(sum_N_similarity)
return cf
except:
return 0.0
def predict(self, taster, country):
"""
Perform Predictions
======
Yields
final_estimation: float
Fulled estimation
"""
self.taster_idx = taster
self.country_idx = country
self.baseline_estimate_txci = RatingEstimator._baseline_estimate(self)
self.cf_txci = RatingEstimator._collaborative_filtering(self, self.k_neighbors)
final_estimation = self.baseline_estimate_txci + self.cf_txci
return final_estimation
def score(self):
"""
Calculate RMSE Score
"""
rxi = np.array([self.baseline_estimate_txci + self.cf_txci]).reshape(1, 1)
rxr = np.array([self.r_matrix[self.country_idx, self.taster_idx]]).reshape(1, 1)
return root_mean_squared_error(rxi, rxr)
In [61]:
rating_estimator = RatingEstimator(k_neighbors=k)
In [62]:
rating_estimator = rating_estimator.fit(r_train)
In [63]:
rating_estimator.predict(tx, ci)
Out[63]:
89.26913061143844
In [64]:
rating_estimator.score()
Out[64]:
0.09807797985949662
Checking it on validation set¶
In [65]:
# Get indices where the value is not np.nan
valid_indices = np.where(~np.isnan(r_valid))
valid_paired_indices = list(zip(valid_indices[0], valid_indices[1]))
In [66]:
r_pred = np.full(r_valid.shape, np.nan)
for i, j in valid_paired_indices:
rating_estimator = RatingEstimator(k_neighbors=5)
rating_estimator.fit(r_train)
pred = rating_estimator.predict(i, j)
r_pred[i, j] = pred
In [67]:
root_mean_squared_error(r_pred, r_valid)
Out[67]:
3.0551220503792593
Learning curve on k-neighbors¶
In [68]:
train_indices = np.where(~np.isnan(r_train))
train_paired_indices = list(zip(train_indices[0], train_indices[1]))
In [69]:
r_train_pred = np.full(r_valid.shape, np.nan)
r_valid_pred = np.full(r_valid.shape, np.nan)
k_list = np.arange(1, 10)
train_rmse_list = list()
valid_rmse_list = list()
for k in k_list:
rating_estimator = RatingEstimator(k_neighbors=k)
rating_estimator = rating_estimator.fit(r_train)
for i, j in train_paired_indices:
pred = rating_estimator.predict(i, j)
r_train_pred[i, j] = pred
train_rmse_list.append(root_mean_squared_error(r_train_pred, r_train))
for i, j in valid_paired_indices:
pred = rating_estimator.predict(i, j)
r_valid_pred[i, j] = pred
valid_rmse_list.append(root_mean_squared_error(r_valid_pred, r_valid))
In [70]:
f, ax = plt.subplots(1, 2, figsize=(18, 4))
ax[0].plot(k_list, train_rmse_list)
ax[1].plot(k_list, valid_rmse_list, color='salmon')
ax[0].set_title('train')
ax[1].set_title('valid')
Out[70]:
Text(0.5, 1.0, 'valid')
Now in test-set with best k¶
In [71]:
k = 2
rating_estimator = RatingEstimator(k_neighbors=k)
rating_estimator = rating_estimator.fit(r_train_holdout)
r_train_holdout_pred = np.full(r_valid.shape, np.nan)
train_holdout_indices = np.where(~np.isnan(r_train))
train_holdout_paired_indices = list(zip(train_holdout_indices[0], train_holdout_indices[1]))
for i, j in train_holdout_paired_indices:
pred = rating_estimator.predict(i, j)
r_train_holdout_pred[i, j] = pred
root_mean_squared_error(r_train_holdout_pred, r_train_holdout)
Out[71]:
2.252778500002268
That's quite ok, perfome better than valid set.
Making recommendations¶
In [72]:
print(f"To test we will recommend nationalities of wines for {df_rank.index[tx]}")
To test we will recommend nationalities of wines for michael schachner
In [73]:
# select only nationalitys that he don't taste yet
null_values = np.where(np.isnan(sparse_matrix[:, tx]))
In [74]:
# refit
final_rating_estimator = RatingEstimator(k_neighbors=k)
final_rating_estimator = final_rating_estimator.fit(sparse_matrix)
ratings_preds = {}
for i in null_values[0]:
pred = rating_estimator.predict(i, tx)
ratings_preds[df_rank.columns[i]] = pred
df_ratings_preds = pd.DataFrame(ratings_preds, index=['ratings_pred']).T
In [75]:
df_ratings_preds.sort_values(by='ratings_pred', ascending=False)[:5]
Out[75]:
| ratings_pred | |
|---|---|
| bosnia and herzegovina | 92.569631 |
| australia | 90.157520 |
| cyprus | 89.100440 |
| georgia | 89.097209 |
| greece | 88.790578 |